{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap4 编写结构化的程序\n",
    "1.  怎样才能写出结构良好，可读性强的程序，从而方便重用？\n",
    "2.  基本的结构块，例如：循环、函数和赋值是如何执行的？\n",
    "3.  Python 编程的陷阱还有哪些，如何避免它们？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 4.5 更多函数操作(P164)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 作为参数的函数\n",
    "sent = ['Take', 'care', 'of', 'the', 'sense', ',',\n",
    "        'and', 'the', 'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']\n",
    "\n",
    "\n",
    "def extract_property(prop):\n",
    "    return [prop(word) for word in sent]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "extract_property(len)=  [4, 4, 2, 3, 5, 1, 3, 3, 6, 4, 4, 4, 2, 10, 1]\n"
     ]
    }
   ],
   "source": [
    "# 将函数作为参数传入，并且在函数内调用\n",
    "print(\"extract_property(len)= \", extract_property(len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "extract_property(last_letter)=  ['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']\nextract_property(lambda w: w[-1])=  ['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']\n"
     ]
    }
   ],
   "source": [
    "def last_letter(word):\n",
    "    return word[-1]\n",
    "\n",
    "\n",
    "# 将自定义函数作为参数传入，并且在函数内调用\n",
    "print(\"extract_property(last_letter)= \", extract_property(last_letter))\n",
    "# 将lambda表达式作为参数传入，并且在函数内调用\n",
    "print(\"extract_property(lambda w: w[-1])= \", extract_property(lambda w: w[-1]))"
   ]
  },
  {
   "source": [
    "### 4.5.2 累积函数"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "search1:\n",
      "(0)Grizzlies', (1)fizzled, (2)Rizzuto, (3)huzzahs, (4)dazzler, \n",
      "(5)jazz, (6)Pezza, (7)Pezza, (8)Pezza, (9)embezzling, \n",
      "(10)embezzlement, (11)pizza, (12)jazz, (13)Ozzie, (14)nozzle, \n",
      "(15)drizzly, (16)puzzle, (17)puzzle, (18)dazzling, (19)Sizzling, \n",
      "(20)guzzle, (21)puzzles, (22)dazzling, (23)jazz, (24)jazz, \n",
      "(25)Jazz, (26)jazz, (27)Jazz, (28)jazz, (29)jazz, \n"
     ]
    }
   ],
   "source": [
    "# Ex4-5 累积输出到一个链表\n",
    "def search1(substring, words):\n",
    "    result = []\n",
    "    for word in words:\n",
    "        if substring in word:\n",
    "            result.append(word)\n",
    "    return result\n",
    "\n",
    "\n",
    "# 全部搜索完毕再输出\n",
    "print(\"search1:\")\n",
    "for i, item in enumerate(search1('zz', nltk.corpus.brown.words())):\n",
    "    if i < 30:\n",
    "        print(f\"({i}){item}\", end=', ')\n",
    "        if i % 5 == 4:\n",
    "            print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "search2:\n",
      "(0)Grizzlies', (1)fizzled, (2)Rizzuto, (3)huzzahs, (4)dazzler, \n",
      "(5)jazz, (6)Pezza, (7)Pezza, (8)Pezza, (9)embezzling, \n",
      "(10)embezzlement, (11)pizza, (12)jazz, (13)Ozzie, (14)nozzle, \n",
      "(15)drizzly, (16)puzzle, (17)puzzle, (18)dazzling, (19)Sizzling, \n",
      "(20)guzzle, (21)puzzles, (22)dazzling, (23)jazz, (24)jazz, \n",
      "(25)Jazz, (26)jazz, (27)Jazz, (28)jazz, (29)jazz, \n"
     ]
    }
   ],
   "source": [
    "# 生成器函数的累积输出\n",
    "# 函数只产生调用程序需要的数据，并不需要分配额外的内存来存储输出\n",
    "def search2(substring, words):\n",
    "    for word in words:\n",
    "        if substring in word:\n",
    "            yield word\n",
    "\n",
    "\n",
    "# 按照次序找到一个就输出一个\n",
    "print(\"search2:\")\n",
    "i = 0\n",
    "for i, item in enumerate(search2('zz', nltk.corpus.brown.words())):\n",
    "    if i < 30:\n",
    "        print(f\"({i}){item}\", end=', ')\n",
    "        if i % 5 == 4:\n",
    "            print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "1-permutation= [['police']]\n2-permutations= [['police', 'fish'], ['fish', 'police']]\n3-permutations= [['police', 'fish', 'buffalo'], ['fish', 'police', 'buffalo'], ['fish', 'buffalo', 'police'], ['police', 'buffalo', 'fish'], ['buffalo', 'police', 'fish'], ['buffalo', 'fish', 'police']]\n"
     ]
    }
   ],
   "source": [
    "# 更复杂的生成器的例子\n",
    "def permutations(seq):\n",
    "    if len(seq) <= 1:\n",
    "        yield seq\n",
    "    else:\n",
    "        for perm in permutations(seq[1:]):\n",
    "            for i in range(len(perm) + 1):\n",
    "                yield perm[:i] + seq[0:1] + perm[i:]\n",
    "\n",
    "\n",
    "print(\"1-permutation=\", list(permutations(['police'])))\n",
    "print(\"2-permutations=\", list(permutations(['police', 'fish'])))\n",
    "print(\"3-permutations=\", list(permutations(['police', 'fish', 'buffalo'])))"
   ]
  },
  {
   "source": [
    "### 4.5.3 高阶函数，函数式编程，Haskell"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "高阶函数=  ['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']\n等价输出=  ['Take', 'care', 'sense', 'sounds', 'take', 'care', 'themselves']\n"
     ]
    }
   ],
   "source": [
    "sent = ['Take', 'care', 'of', 'the', 'sense', ',',\n",
    "        'and', 'the', 'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']\n",
    "\n",
    "\n",
    "# 检查一个词是否来自一个开放的实词类\n",
    "def is_content_word(word):\n",
    "    return word.lower() not in ['a', 'of', 'the', 'and', 'will', ',', '.']\n",
    "\n",
    "\n",
    "# 将is_content_word()函数作为参数传入filter()函数中，就是高阶函数\n",
    "print(\"高阶函数= \", list(filter(is_content_word, sent)))\n",
    "\n",
    "# 下面是等价的代码\n",
    "word_list = [\n",
    "        w\n",
    "        for w in sent\n",
    "        if is_content_word(w)\n",
    "]\n",
    "print(\"等价输出= \", word_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "21.75081116158339"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "# 将len()函数作为参数传入 map()函数中，就是高阶函数\n",
    "lengths = list(map(len, nltk.corpus.brown.sents(categories='news')))\n",
    "sum(lengths) / len(lengths)  # 布朗语料库中句子的平均长度\n",
    "\n",
    "# list()函数不支持将函数作为参数传入\n",
    "# list(len, ['a', 'b', 'c'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "21.75081116158339"
      ]
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "source": [
    "# 下面是等价的代码\n",
    "lengths = [\n",
    "        len(w) \n",
    "        for w in nltk.corpus.brown.sents(categories='news')]\n",
    "sum(lengths) / len(lengths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "2\n[2, 2, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 1, 3, 0]\n"
     ]
    }
   ],
   "source": [
    "# 多重函数嵌套\n",
    "print(len(list(filter(lambda c: c.lower() in \"aeiou\", 'their'))))\n",
    "print(list(map(lambda w: len(list(filter(lambda c: c.lower() in \"aeiou\", w))), sent)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[2, 2, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 1, 3, 0]\n[['a', 'e'], ['a', 'e'], ['o'], ['e'], ['e', 'e'], [], ['a'], ['e'], ['o', 'u'], ['i'], ['a', 'e'], ['a', 'e'], ['o'], ['e', 'e', 'e'], []]\n['a', 'e', 'a', 'e', 'o', 'e', 'e', 'e', 'a', 'e', 'o', 'u', 'i', 'a', 'e', 'a', 'e', 'o', 'e', 'e', 'e']\n"
     ]
    }
   ],
   "source": [
    "# 下面是等价的代码，以链表推导为基础的解决方案通常比基于高阶函数的解决方案可读性更好\n",
    "print([\n",
    "        len([\n",
    "                c\n",
    "                for c in w\n",
    "                if c.lower() in \"aeiou\"\n",
    "        ])\n",
    "        for w in sent\n",
    "])\n",
    "\n",
    "print([\n",
    "        [\n",
    "                c\n",
    "                for c in w\n",
    "                if c.lower() in \"aeiou\"\n",
    "        ]\n",
    "        for w in sent\n",
    "])\n",
    "\n",
    "# 错误的输出结果\n",
    "print([\n",
    "        c\n",
    "        for w in sent\n",
    "        for c in w\n",
    "        if c.lower() in \"aeiou\"\n",
    "])"
   ]
  },
  {
   "source": [
    "### 4.5.4 参数的命名(P167)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "repeat(num=3)=  <empty><empty><empty>\nrepeat(msg='Alice')=  Alice\nrepeat(num=3, msg='Alice')=  AliceAliceAlice\n"
     ]
    }
   ],
   "source": [
    "# 关键字参数：就是通过名字引用参数，\n",
    "# 可以防止参数混淆，还可以指定默认值，还可以按做生意顺序访问参数\n",
    "def repeat(msg='<empty>', num=1):\n",
    "    return msg * num\n",
    "\n",
    "\n",
    "print(\"repeat(num=3)= \", repeat(num=3))\n",
    "print(\"repeat(msg='Alice')= \", repeat(msg='Alice'))\n",
    "print(\"repeat(num=3, msg='Alice')= \", repeat(num=3, msg='Alice'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "list(dict)=  ['a', 'b', 'c']\ndict.keys()=  dict_keys(['a', 'b', 'c'])\ndict.values()=  dict_values([1, 2, 3])\ndict.items()=  dict_items([('a', 1), ('b', 2), ('c', 3)])\ndict['a']=  1\ndict.get('a')=  1\n"
     ]
    }
   ],
   "source": [
    "dict = {'a': 1, 'b': 2, 'c': 3}\n",
    "print(\"list(dict)= \", list(dict))\n",
    "print(\"dict.keys()= \", dict.keys())\n",
    "print(\"dict.values()= \", dict.values())\n",
    "print(\"dict.items()= \", dict.items())\n",
    "print(\"dict['a']= \", dict['a'])\n",
    "print(\"dict.get('a')= \", dict.get('a'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "value=1\nvalue=African swallow\nkwargs['monty']=  Python\nname:value=monty: Python\nname:value=lover: sy\n"
     ]
    }
   ],
   "source": [
    "# 参数元组：*args 和 关键字参数字典：*kwargs\n",
    "def generic(*args, **kwargs):\n",
    "    for value in args:\n",
    "        print('value={}'.format(value))\n",
    "    print(\"kwargs['monty']= \",kwargs['monty'])\n",
    "    for name in list(kwargs):\n",
    "        print('name:value={}: {}'.format(name, kwargs[name]))\n",
    "\n",
    "\n",
    "generic(1, \"African swallow\", monty='Python', lover='sy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "list(zip(song[0], song[1], song[2]))=  [('four', 'three', 'two'), ('calling', 'French', 'turtle'), ('birds', 'hens', 'doves')]\nlist(zip(*song))=  [('four', 'three', 'two', 'one'), ('calling', 'French', 'turtle', 'tiger'), ('birds', 'hens', 'doves', 'animal')]\nlist(zip(song))=  [(['four', 'calling', 'birds'],), (['three', 'French', 'hens'],), (['two', 'turtle', 'doves'],), (['one', 'tiger', 'animal'],)]\n"
     ]
    }
   ],
   "source": [
    "# zip() 函数对 *args 的支持\n",
    "# *song 表示分解输入数据，等价于song[0],song[1],song[2],song[3]，分组的数目就是输入的数目\n",
    "song = [['four', 'calling', 'birds'], ['three', 'French', 'hens'], ['two', 'turtle', 'doves'],\n",
    "        ['one', 'tiger', 'animal']]\n",
    "print(\"list(zip(song[0], song[1], song[2]))= \", list(zip(song[0], song[1], song[2])))\n",
    "print(\"list(zip(*song))= \", list(zip(*song)))\n",
    "print(\"list(zip(song))= \", list(zip(song)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "freq_words('func.py', 4, 10)=  [('return', 9), ('word', 7), ('text', 5), ('my_text_data', 3), ('w.lower', 3), ('zYx.Tom', 2), ('word_count', 2), ('vocab_size', 2), ('diversity_score', 2), ('count', 2)]\nfreq_words('func.py', min=4, num=10)=  [('return', 9), ('word', 7), ('text', 5), ('my_text_data', 3), ('w.lower', 3), ('zYx.Tom', 2), ('word_count', 2), ('vocab_size', 2), ('diversity_score', 2), ('count', 2)]\nfreq_words('func.py', num=10, min=4)=  [('return', 9), ('word', 7), ('text', 5), ('my_text_data', 3), ('w.lower', 3), ('zYx.Tom', 2), ('word_count', 2), ('vocab_size', 2), ('diversity_score', 2), ('count', 2)]\n"
     ]
    }
   ],
   "source": [
    "# 三种等效的方法调用函数 freq_words()\n",
    "def freq_words(file, min=1, num=10):\n",
    "    text = open(file, encoding='utf-8').read()\n",
    "    tokens = nltk.word_tokenize(text)\n",
    "    freqdist = nltk.FreqDist(t for t in tokens if len(t) >= min)\n",
    "    return freqdist.most_common(num)\n",
    "\n",
    "\n",
    "print(\"freq_words('func.py', 4, 10)= \", freq_words('func.py', 4, 10))\n",
    "print(\"freq_words('func.py', min=4, num=10)= \", freq_words('func.py', min=4, num=10))\n",
    "print(\"freq_words('func.py', num=10, min=4)= \", freq_words('func.py', num=10, min=4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Opening func.py\nRead in 7 characters\n............................................................................................................\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[('return', 9),\n",
       " ('word', 7),\n",
       " ('text', 5),\n",
       " ('my_text_data', 3),\n",
       " ('w.lower', 3),\n",
       " ('zYx.Tom', 2),\n",
       " ('word_count', 2),\n",
       " ('vocab_size', 2),\n",
       " ('diversity_score', 2),\n",
       " ('count', 2)]"
      ]
     },
     "metadata": {},
     "execution_count": 19
    }
   ],
   "source": [
    "# 增加了 verbose 参数，可以设置为调试开关\n",
    "def freq_words(file, min=1, num=10, verbose=False):\n",
    "    freqdist = nltk.FreqDist()\n",
    "    if verbose:\n",
    "        print('Opening', file)\n",
    "    with open(file, encoding='utf-8') as f:\n",
    "        text = f.read()\n",
    "    if verbose:\n",
    "        print('Read in %d characters' % len(file))\n",
    "    for word in nltk.word_tokenize(text):\n",
    "        if len(word) >= min:\n",
    "            freqdist[word] += 1\n",
    "            if verbose and freqdist.N() % 100 != 0:\n",
    "                print('.', end='')\n",
    "    if verbose:\n",
    "        print()\n",
    "    return freqdist.most_common(num)\n",
    "\n",
    "\n",
    "freq_words('func.py', 4, 10, True)"
   ]
  }
 ]
}